Experimental Analysis of Cellseq Adipose Tissue Macrophages

In this line of experimentation we had four experimental groups:

  • Low Fat Diet
  • High Fat Diet
  • High Fat Diet + Schistosoma Egg Antigen (SEA)
  • High Fat Diet + Omega-1 Glycoprotein

Two phases of analysis:

  1. Is the transcriptome of SEA and omega-1-treated mice similar to one another, and more similar to LFD-fed mice than HFD-fed mice?
  2. Moreover, we would like to look at most differentially changed mRNAs in SEA-treated and omega-1-treated mice compared to näïve HFD-fed mice, to have some hints at which pathways these macrophages may use to improve tissue-specific insulin sensitivity. A gene ontology analysis might provide useful insight in what's happening in these macrophages.

Data Preparation

In [1]:
#Data Handling
import numpy as np
import pandas as pd
from sklearn.preprocessing import minmax_scale
#Visualization
from matplotlib import pyplot as plt
%matplotlib inline
from IPython.display import display
import seaborn as sns
import plotly.offline as plotly
plotly.offline.init_notebook_mode()
from plotly.graph_objs import *
import plotly.figure_factory as FF
#Feature extraction
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster.bicluster import SpectralBiclustering, SpectralCoclustering
#Clustering
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import NMF
#Misc
from scipy.signal import argrelextrema
from math import floor
from os import listdir
In [2]:
metadata = pd.read_csv("sample_sheet_Patrick_AdiposeMacs.txt", sep="\t")
metadata["#id"] = metadata["#id"].astype(str)
metadata
Out[2]:
#id flocell series lane il_barcode cel_barcode project
0 2 CALW2ANXX LFD L001 pool 2 Patrick_AdiposeMacs
1 6 CALW2ANXX HFD L001 pool 3 Patrick_AdiposeMacs
2 7 CALW2ANXX HFD L001 pool 4 Patrick_AdiposeMacs
3 11 CALW2ANXX HFD-SEA L001 pool 5 Patrick_AdiposeMacs
4 12 CALW2ANXX HFD-SEA L001 pool 6 Patrick_AdiposeMacs
5 13 CALW2ANXX HFD-SEA L001 pool 7 Patrick_AdiposeMacs
6 14 CALW2ANXX HFD-SEA L001 pool 8 Patrick_AdiposeMacs
7 16 CALW2ANXX HFD-pLex L001 pool 9 Patrick_AdiposeMacs
8 17 CALW2ANXX HFD-pLex L001 pool 10 Patrick_AdiposeMacs
9 18 CALW2ANXX HFD-pLex L001 pool 11 Patrick_AdiposeMacs
10 20 CALW2ANXX HFD-pLex L001 pool 12 Patrick_AdiposeMacs
In [3]:
#Import data from files
study_data = None
for fname in listdir("htseq_count/"):
    exp_id_long = fname.split("_")[4].split(".")[0]
    exp_id = exp_id_long.lstrip("0")
    exp_data = pd.read_csv("htseq_count/"+fname, sep="\t", header=None, names=["gene", str(exp_id)])
    if study_data is None:
        study_data = exp_data
    else:      
        study_data = study_data.merge(exp_data, on="gene", how="outer")
#Clean up        
study_data.index = study_data["gene"].values
del study_data["gene"]
study_data = study_data.T
study_data = study_data.merge(metadata, left_index=True, right_on="#id")
study_data.index = study_data.apply(lambda x: x["#id"] + " " + x["series"], axis=1)
study_data.to_csv("studydata.csv")
sample_readinfo = study_data.iloc[:,-12:-1]
study_data = study_data.iloc[:,:-12]
#Display dfs
display(study_data.head())
display(sample_readinfo.head())
#Sanity check
if study_data.isnull().sum().sum() != 0:
    print("Some genes missing from count file")
0610005C13Rik 0610007P14Rik 0610009B22Rik 0610009L18Rik 0610009O20Rik 0610010B08Rik 0610010F05Rik 0610010K14Rik 0610011F06Rik 0610012G03Rik ... Zxda Zxdb Zxdc Zyg11a Zyg11b Zyx Zzef1 Zzz3 a l7Rn6
16 HFD-pLex 0 28 14 4 16 0 13 127 7 243 ... 6 10 15 0 22 242 8 15 0 30
17 HFD-pLex 0 81 23 17 62 0 64 310 23 598 ... 18 19 48 0 68 649 38 38 0 139
18 HFD-pLex 0 24 11 5 20 0 10 92 10 170 ... 6 2 15 0 14 188 4 8 0 31
20 HFD-pLex 0 17 11 3 16 0 12 66 0 131 ... 5 3 10 0 13 166 23 5 0 26
11 HFD-SEA 0 63 26 3 40 0 28 213 27 400 ... 11 7 39 0 24 388 32 16 0 106

5 rows × 24421 columns

no_feature ambiguous too_low_aQual not_aligned alignment_not_unique #id flocell series lane il_barcode cel_barcode
16 HFD-pLex 389086 8790 3773792 962931 0 16 CALW2ANXX HFD-pLex L001 pool 9
17 HFD-pLex 1161915 26330 9982554 2104229 0 17 CALW2ANXX HFD-pLex L001 pool 10
18 HFD-pLex 254857 6185 2750164 612610 0 18 CALW2ANXX HFD-pLex L001 pool 11
20 HFD-pLex 293660 7165 2463895 619809 0 20 CALW2ANXX HFD-pLex L001 pool 12
11 HFD-SEA 709937 17129 6558013 1738422 0 11 CALW2ANXX HFD-SEA L001 pool 5
In [4]:
count_matrix = minmax_scale(study_data.as_matrix(), axis=0)
heatmap = Data([
    Heatmap(
        x = study_data.columns, 
        y = study_data.index,
        z = count_matrix,
        colorscale='Viridis'
    )
])
layout = Layout(
    title='Gene Expression Heatmap',
    yaxis=dict(type="category"),
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
C:\Users\jpole\Anaconda3\lib\site-packages\sklearn\utils\validation.py:429: DataConversionWarning:

Data with input dtype int64 was converted to float64.

In [5]:
p = sns.barplot(study_data.index, study_data.sum(axis=1).as_matrix())
p.set_xticklabels(study_data.index, rotation=90)
plt.title("Gene Count Intensity Per Experiment")
plt.show()
In [6]:
norm_count_matrix = study_data.as_matrix() / study_data.sum(axis=1)[:, None]
norm_count_matrix = norm_count_matrix[:, norm_count_matrix.sum(axis=0) > 0]
norm_count_matrix = minmax_scale(norm_count_matrix)
print(count_matrix.shape, norm_count_matrix.shape)
heatmap = Data([
    Heatmap(
        x = study_data.columns, 
        y = study_data.index,
        z = norm_count_matrix,
        colorscale='Viridis'
    )
])
layout = Layout(
    title='Gene Expression Heatmap Normalized By Per Experiment Intensity',
    yaxis=dict(type="category"),
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
(11, 24421) (11, 15665)

Principal Component Analysis and KMeans Clustering

In [7]:
def plotClusters(comp1, comp2, comp3, treatments, clusters, title=""):
    visrep = Scatter3d(
        x=comp1,
        y=comp2,
        z=comp3,
        text=treatments,
        mode='markers',
        marker=dict(
            size=12,
            color=clusters,
            colorscale='Viridis',
            opacity=0.8,
            line = dict(
                width = 0,
                color = "black"
            )
        )
    )
    fig = Figure(data=[visrep], layout=Layout(title=title))
    plotly.iplot(fig)
In [8]:
#PCA Decomposition to support later visualization
dim_red = PCA(n_components=10)
dim_red_genes = dim_red.fit_transform(norm_count_matrix)
print("% Variance Explained:", dim_red.explained_variance_ratio_.sum()*100)
plotClusters(dim_red_genes.T[0], dim_red_genes.T[1], dim_red_genes.T[2], 
             study_data.index, sample_readinfo["series"].astype("category").cat.rename_categories([0,1,2,3]), 
             "Experimental Groups on 3 PCs")
% Variance Explained: 100.0
In [9]:
#PCA Decomposition to support later visualization
dim_red = PCA(n_components=10)
dim_red_genes = dim_red.fit_transform(norm_count_matrix)
print("% Variance Explained:", dim_red.explained_variance_ratio_.sum()*100)
#K-Means Clustering
clf = KMeans(n_clusters=4)
clf.fit(dim_red_genes, sample_readinfo["series"])
serieshat = clf.predict(dim_red_genes)
plotClusters(dim_red_genes.T[0], dim_red_genes.T[1], dim_red_genes.T[2], study_data.index, serieshat, 
             "Experimental Groups on 3 PCs Colored by KMeans Clustering (n=4)")
% Variance Explained: 100.0

PCA Decomposition to support later visualization

dim_red = PCA(n_components=10) dim_red_genes = dim_red.fit_transform(norm_count_matrix) print("% Variance Explained:", dim_red.explained_varianceratio.sum()*100)

K-Means Clustering

clf = KMeans(n_clusters=5) clf.fit(dim_red_genes, sample_readinfo["series"]) serieshat = clf.predict(dim_red_genes) plotClusters(dim_red_genes.T[0], dim_red_genes.T[1], dim_red_genes.T[2], study_data.index, serieshat, "Experimental Groups on 3 PCs Colored by KMeans Clustering (n=5)")

In [10]:
#NMF Decomposition
nmf_decomp = NMF(n_components=4)
nmf_genes = nmf_decomp.fit_transform(norm_count_matrix, sample_readinfo["series"])
heatmap = Data([
    Heatmap(
        y = study_data.index,
        z = np.asarray(nmf_genes),    
        colorscale = 'Viridis'
    )
])
layout = Layout(
    title='NMF Decomposition of Gene Expression Profiles',
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
In [11]:
def nmf_cluster(nmf_data, threshold):
    nmf_data = pd.DataFrame(nmf_data)
    return nmf_data.apply(lambda x: x > threshold, axis=0);
step_size = 1.75;
min_val=0
max_val=10
tuning_range = np.arange(min_val, max_val+step_size, step_size);
order = .25*(max_val-min_val)/step_size
tuning_data = [(nmf_cluster(nmf_genes, i).sum(axis=1) == 1).sum()/nmf_genes.shape[0] for i in tuning_range]
plt.plot(tuning_range, tuning_data)
lextrema = argrelextrema(np.asarray(tuning_data), np.greater, order=floor(order))[0]
for lmin in tuning_range[lextrema]:
    plt.axvline(x=lmin, color="red", lw=.5)
plt.show()
In [12]:
heatmap = Data([
    Heatmap(
        y = study_data.index,
        z = np.asarray(nmf_cluster(nmf_genes, tuning_range[lextrema[0]])).astype(int),    
        colorscale = 'Viridis'
    )
])
layout = Layout(
    title='NMF Soft Clustering Assignments of Gene Expression Profiles',
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
In [13]:
#NMF Decomposition
nmf_decomp = NMF(n_components=5)
nmf_genes = nmf_decomp.fit_transform(norm_count_matrix, sample_readinfo["series"])
heatmap = Data([
    Heatmap(
        y = study_data.index,
        z = np.asarray(nmf_genes),    
        colorscale = 'Viridis'
    )
])
layout = Layout(
    title='NMF Decomposition of Gene Expression Profiles',
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
def nmf_cluster(nmf_data, threshold):
    nmf_data = pd.DataFrame(nmf_data)
    return nmf_data.apply(lambda x: x > threshold, axis=0);
step_size = 1.75;
min_val=0
max_val=10
tuning_range = np.arange(min_val, max_val+step_size, step_size);
order = .25*(max_val-min_val)/step_size
tuning_data = [(nmf_cluster(nmf_genes, i).sum(axis=1) == 1).sum()/nmf_genes.shape[0] for i in tuning_range]
plt.plot(tuning_range, tuning_data)
lextrema = argrelextrema(np.asarray(tuning_data), np.greater, order=floor(order))[0]
for lmin in tuning_range[lextrema]:
    plt.axvline(x=lmin, color="red", lw=.5)
plt.show()
heatmap = Data([
    Heatmap(
        y = study_data.index,
        z = np.asarray(nmf_cluster(nmf_genes, tuning_range[lextrema[0]])).astype(int),    
        colorscale = 'Viridis'
    )
])
layout = Layout(
    title='NMF Soft Clustering Assignments of Gene Expression Profiles',
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
In [14]:
biclust = SpectralBiclustering(n_clusters=5)
biclust.fit(norm_count_matrix)
fit_data = np.asarray(norm_count_matrix)[np.argsort(biclust.row_labels_)]
fit_data = fit_data[:, np.argsort(biclust.column_labels_)]

heatmap = Data([
    Heatmap(
        y = study_data.index[np.argsort(biclust.row_labels_)], 
        x = study_data.columns[np.argsort(biclust.column_labels_)],
        z = fit_data,    
        colorscale = 'Viridis'
    )
])
layout = Layout(
    title='Spectral BiClustering of Gene Expression Profiles',
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)
In [15]:
sns.distplot(np.std(norm_count_matrix, axis=0))
#Limit features
gene_filter = np.std(norm_count_matrix, axis=0) > .3
limited_study_data = study_data.loc[:, gene_filter]
limited_count_matrix = norm_count_matrix[:, gene_filter]
print(count_matrix.shape, limited_count_matrix.shape)
(11, 24421) (11, 7248)
In [16]:
heatmap = Data([
    Heatmap(
        x = limited_study_data.columns.values, 
        y = limited_study_data.index,
        z = limited_count_matrix,
        colorscale='Viridis'
    )
])
layout = Layout(
    title='Limited Gene Expression Heatmap',
    yaxis=dict(type="category")
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)

Hierarchical Clustering

In [17]:
def clusterer(data, classes, labels, affinity_method, linkage_method):
    #Create crosstab
    Hclustering = AgglomerativeClustering(n_clusters=5, affinity=affinity_method, linkage=linkage_method)
    Hclustering.fit(data)
    ms = np.column_stack((classes, Hclustering.labels_))
    df = pd.DataFrame(ms, columns = ["Ground truth","Clusters"])
    display(pd.crosstab(df["Ground truth"], df["Clusters"], margins=True))
    #Plot dendrogram
    z = linkage(data, metric=affinity_method, method=linkage_method)
    dendrogram(z, labels=labels, orientation="left")
    plt.show()
In [18]:
clusterer(norm_count_matrix, sample_readinfo["series"], study_data.index, "euclidean", "ward")
Clusters 0 1 2 3 4 All
Ground truth
HFD 0 2 0 0 0 2
HFD-SEA 0 0 0 4 0 4
HFD-pLex 3 0 1 0 0 4
LFD 0 0 0 0 1 1
All 3 2 1 4 1 11
In [19]:
clusterer(norm_count_matrix, sample_readinfo["series"], study_data.index, "euclidean", "complete")
Clusters 0 1 2 3 4 All
Ground truth
HFD 0 2 0 0 0 2
HFD-SEA 0 0 0 0 4 4
HFD-pLex 3 0 1 0 0 4
LFD 0 0 0 1 0 1
All 3 2 1 1 4 11
In [20]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.figure_factory as FF

import numpy as np
from scipy.spatial.distance import pdist, squareform


# get data
data_array = norm_count_matrix
labels = study_data.index

# Initialize figure by creating upper dendrogram
figure = FF.create_dendrogram(data_array, orientation='bottom', labels=labels, 
                              linkagefun=lambda x: linkage(data_array, 'ward', metric='euclidean'))
for i in range(len(figure['data'])):
    figure['data'][i]['yaxis'] = 'y2'

# Create Side Dendrogram
dendro_side = FF.create_dendrogram(data_array, orientation='right',
                                   linkagefun=lambda x: linkage(data_array, 'ward', metric='euclidean'))
for i in range(len(dendro_side['data'])):
    dendro_side['data'][i]['xaxis'] = 'x2'

# Add Side Dendrogram Data to Figure
figure['data'].extend(dendro_side['data'])

# Create Heatmap
dendro_leaves = dendro_side['layout']['yaxis']['ticktext']
dendro_leaves = list(map(int, dendro_leaves))
data_dist = pdist(data_array, metric="euclidean")
heat_data = squareform(data_dist)
heat_data = heat_data[dendro_leaves,:]
heat_data = heat_data[:,dendro_leaves]
heat_data[heat_data == 0] = None
heatmap = Data([
    Heatmap(
        x = dendro_leaves,
        y = study_data.index[dendro_leaves],
        z = -heat_data,
        colorscale = 'Viridis'
    )
])

heatmap[0]['x'] = figure['layout']['xaxis']['tickvals']
heatmap[0]['y'] = dendro_side['layout']['yaxis']['tickvals']

# Add Heatmap Data to Figure
figure['data'].extend(Data(heatmap))

# Edit Layout
figure['layout'].update({'width':800, 'height':800,
                         'showlegend':False, 'hovermode': 'closest',
                         })
# Edit xaxis
figure['layout']['xaxis'].update({'domain': [.15, 1],
                                  'mirror': False,
                                  'showgrid': False,
                                  'showline': False,
                                  'zeroline': False,
                                  'ticks':""})
# Edit xaxis2
figure['layout'].update({'xaxis2': {'domain': [0, .15],
                                   'mirror': False,
                                   'showgrid': False,
                                   'showline': False,
                                   'zeroline': False,
                                   'showticklabels': False,
                                   'ticks':""}})

# Edit yaxis
figure['layout']['yaxis'].update({'domain': [0, .85],
                                  'mirror': False,
                                  'showgrid': False,
                                  'showline': False,
                                  'zeroline': False,
                                  'showticklabels': False,
                                  'ticks': ""})
# Edit yaxis2
figure['layout'].update({'yaxis2':{'domain':[.825, .975],
                                   'mirror': False,
                                   'showgrid': False,
                                   'showline': False,
                                   'zeroline': False,
                                   'showticklabels': False,
                                   'ticks':""}})

# Plot!
plotly.iplot(figure)
In [ ]:
 
In [21]:
def plot_dendromap(data_array, labels):

    heat_data = np.abs(np.asarray(data_array))

    # Initialize figure by creating upper dendrogram
    figure = FF.create_dendrogram(heat_data, orientation='right', labels=labels)
    for i in range(len(figure['data'])):
        figure['data'][i]['xaxis'] = 'y2'

    # Create Side Dendrogram
    dendro_side = FF.create_dendrogram(heat_data, orientation='right')
    for i in range(len(dendro_side['data'])):
        dendro_side['data'][i]['xaxis'] = 'x2'

    # Add Side Dendrogram Data to Figure
    #figure['data'].extend(dendro_side['data'])

    # Create Heatmap
    dendro_leaves = dendro_side['layout']['yaxis']['ticktext']
    dendro_leaves = list(map(int, dendro_leaves))
    
    #Means of calculating hierarchy based on obs. distance
    #heat_data = pdist(data_array)
    #squareform(data_dist)
    
    #Reoranize the heatmap based upon the dendrogram
    heat_data = heat_data[dendro_leaves,:]
    #heat_data = heat_data[:,dendro_leaves]

    heatmap = Data([
        Heatmap(
            y = dendro_leaves,
            z = heat_data,    
            colorscale = 'Viridis'
        )
    ])

    #heatmap[0]['x'] = figure['layout']['xaxis']['tickvals']
    heatmap[0]['y'] = dendro_side['layout']['yaxis']['tickvals']

    # Add Heatmap Data to Figure
    figure['data'].extend(Data(heatmap))

    # Edit Layout
    figure['layout'].update({'width':800, 'height':800,
                             'showlegend':False, 'hovermode': 'closest',
                             })
    # Edit xaxis
    figure['layout']['xaxis'].update({'domain': [.15, 1],
                                      'mirror': False,
                                      'showgrid': False,
                                      'showline': False,
                                      'zeroline': False,
                                      'ticks':""})
    # Edit xaxis2
    figure['layout'].update({'xaxis2': {'domain': [0, .15],
                                       'mirror': False,
                                       'showgrid': False,
                                       'showline': False,
                                       'zeroline': False,
                                       'showticklabels': False,
                                       'ticks':""}})

    # Edit yaxis
    figure['layout']['yaxis'].update({'domain': [0, .85],
                                      'mirror': False,
                                      'showgrid': False,
                                      'showline': False,
                                      'zeroline': False,
                                      'showticklabels': False,
                                      'ticks': ""})
    # Edit yaxis2
    figure['layout'].update({'yaxis2':{'domain':[.825, .975],
                                       'mirror': False,
                                       'showgrid': False,
                                       'showline': False,
                                       'zeroline': False,
                                       'showticklabels': False,
                                       'ticks':""}})

    # Plot!
    plotly.iplot(figure)
In [22]:
plot_dendromap(norm_count_matrix, study_data.index)

Pairwise Distance Mapping

In [23]:
from sklearn.metrics.pairwise import pairwise_distances
from numpy import seterr,isneginf,array

dist_matrix = pairwise_distances(norm_count_matrix, metric="euclidean")
dist_matrix[dist_matrix==0] = None

heatmap = Data([
    Heatmap(
        y = study_data.index, 
        x = study_data.index,
        z = dist_matrix,
        colorscale='Hot'
    )
])
layout = Layout(
    title='Pairwise Euclidean Distances',
    margin=Margin(l=100)
)
fig = Figure(data=heatmap, layout=layout)
plotly.iplot(fig)

Feature Selection

from sklearn.decomposition import SparsePCA sparser = SparsePCA(n_components=4, n_jobs=-1) sparser.fit(norm_count_matrix, studydata.index) nullsum = np.abs(sparser.components).sum(axis=0) > 0 sparsed_labels = study_data.columns[nullsum] sparsedgenes = sparser.components[:, nullsum] heatmap = Data([ Heatmap( x = sparsed_labels, z = sparsed_genes, colorscale='Hot' ) ]) layout = Layout( title='Pairwise Euclidean Distances', margin=Margin(l=100) ) fig = Figure(data=heatmap, layout=layout) plotly.iplot(fig)